blasteo <- function(filename, pathPan){
query <- paste('-query', paste(pathPan, filename, sep = '/'))
subject <- paste('-subject', paste(pathPan, filename, sep = '/'))
cleanFileName <- gsub(".faa", '', filename)
fileOut <- paste(cleanFileName, '-blastp.out', sep= '')
out <- paste('-out', fileOut)
blastp <- paste('blastp', query, subject, '-num_threads 5 -outfmt 7 -max_hsps 1 -use_sw_tback', out)
system(blastp)
return(0)
}
blasteo('ABC.faa', '/export/storage/users/aescobed/Bioinfo_LCG/Modulo-4/Proyecto')#Solo se mantienen aquellas lineas que no sean comentarios
quitComments <- function(line){
if (startsWith(x = line, prefix= '#') == FALSE) {
return(line)}
else {
return(NA)}
}
#Convertir a un vector una linea separada por tabs
splitLines <- function(line) {
splitedLine <- as.vector(str_split(line, pattern= '\t', simplify =TRUE))
return (splitedLine)
}
#Convertir un archivo output de blastp (tabular con comentarios) a un data frame sin comentarios
fileToDataFrame <- function(filename){
fileContent <- readLines(filename)
cleanFileContent <- as.vector(sapply(fileContent, quitComments))
cleanestFileContent <- cleanFileContent[!is.na(cleanFileContent)]
fragmentedFileContent <- sapply(cleanestFileContent, splitLines)
preDataframeFileContent <- as.data.frame(fragmentedFileContent)
if (dim(preDataframeFileContent)[1] > 0){
names(preDataframeFileContent) <- NULL
dataFrameFileContent <- as.data.frame(t(preDataframeFileContent))
names(dataFrameFileContent) <- c('query_acc.ver', 'subject_acc.ver', '%_identity', 'alignment_length', 'mismatches', 'gap_opens', 'q._start', 'q._end', 's._start', 's. end', 'evalue', 'bit_score')
return(dataFrameFileContent)}
else {
return(data.frame())}
}
data <- fileToDataFrame('ABC-blastp.out')
datadisMatrix <- matrix(rep(0,10000), nrow= 100, ncol=100)
protsNames <- unique(data$query_acc.ver)
colnames(disMatrix) <- protsNames
row.names(disMatrix) <- protsNames
for (protA in protsNames) {
for (protB in protsNames) {
if (protA != protB){
filterData <- filter(data, query_acc.ver== protA, subject_acc.ver == protB)
if (dim(filterData)[1] != 0){
bitScore <- filterData$bit_score[1]
disMatrix[protA, protB] <- as.numeric(bitScore)}
else{
disMatrix[protA, protB] <- as.numeric(0)}
}
else{
disMatrix[protA, protB] <- as.numeric(1)}
}
}
disMatrix[1:10, 1:10] ABC1-Q03024 ABC1-C1DS84 ABC1-Q54456 ABC1-Q9RHT2 ABC1-Q9ZG94
ABC1-Q03024 1 212 194 190 183
ABC1-C1DS84 199 1 199 200 191
ABC1-Q54456 192 212 1 193 189
ABC1-Q9RHT2 189 212 194 1 189
ABC1-Q9ZG94 187 206 193 191 1
ABC1-O67184 182 202 176 166 158
ABC1-Q9Z3C5 179 201 185 155 165
ABC1-O05693 173 184 174 161 173
ABC1-P23596 169 197 184 185 172
ABC1-O85350 167 189 173 162 160
ABC1-O67184 ABC1-Q9Z3C5 ABC1-O05693 ABC1-P23596 ABC1-O85350
ABC1-Q03024 198 181 175 168 185
ABC1-C1DS84 202 187 170 184 189
ABC1-Q54456 192 185 177 181 191
ABC1-Q9RHT2 182 157 166 184 183
ABC1-Q9ZG94 172 167 175 171 177
ABC1-O67184 1 183 162 167 180
ABC1-Q9Z3C5 200 1 170 152 188
ABC1-O05693 172 172 1 160 178
ABC1-P23596 184 155 162 1 162
ABC1-O85350 180 171 168 142 1
normDisMatrix <-matrix(rep(0,10000), nrow= 100, ncol=100)
colnames(normDisMatrix) <- protsNames
row.names(normDisMatrix) <- protsNames
maxBitScore <- max(apply(disMatrix, 1, max))
for (protA in protsNames) {
for (protB in protsNames) {
if (protA != protB){
norm <- disMatrix[protA, protB]/maxBitScore
dis <- 1- norm
normDisMatrix[protA, protB] <- dis
}
}
}
normDisMatrix[1:10, 1:10] ABC1-Q03024 ABC1-C1DS84 ABC1-Q54456 ABC1-Q9RHT2 ABC1-Q9ZG94
ABC1-Q03024 0.0000000 0.2262774 0.2919708 0.3065693 0.3321168
ABC1-C1DS84 0.2737226 0.0000000 0.2737226 0.2700730 0.3029197
ABC1-Q54456 0.2992701 0.2262774 0.0000000 0.2956204 0.3102190
ABC1-Q9RHT2 0.3102190 0.2262774 0.2919708 0.0000000 0.3102190
ABC1-Q9ZG94 0.3175182 0.2481752 0.2956204 0.3029197 0.0000000
ABC1-O67184 0.3357664 0.2627737 0.3576642 0.3941606 0.4233577
ABC1-Q9Z3C5 0.3467153 0.2664234 0.3248175 0.4343066 0.3978102
ABC1-O05693 0.3686131 0.3284672 0.3649635 0.4124088 0.3686131
ABC1-P23596 0.3832117 0.2810219 0.3284672 0.3248175 0.3722628
ABC1-O85350 0.3905109 0.3102190 0.3686131 0.4087591 0.4160584
ABC1-O67184 ABC1-Q9Z3C5 ABC1-O05693 ABC1-P23596 ABC1-O85350
ABC1-Q03024 0.2773723 0.3394161 0.3613139 0.3868613 0.3248175
ABC1-C1DS84 0.2627737 0.3175182 0.3795620 0.3284672 0.3102190
ABC1-Q54456 0.2992701 0.3248175 0.3540146 0.3394161 0.3029197
ABC1-Q9RHT2 0.3357664 0.4270073 0.3941606 0.3284672 0.3321168
ABC1-Q9ZG94 0.3722628 0.3905109 0.3613139 0.3759124 0.3540146
ABC1-O67184 0.0000000 0.3321168 0.4087591 0.3905109 0.3430657
ABC1-Q9Z3C5 0.2700730 0.0000000 0.3795620 0.4452555 0.3138686
ABC1-O05693 0.3722628 0.3722628 0.0000000 0.4160584 0.3503650
ABC1-P23596 0.3284672 0.4343066 0.4087591 0.0000000 0.4087591
ABC1-O85350 0.3430657 0.3759124 0.3868613 0.4817518 0.0000000
#Para clustering single
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "single", method = "silhouette", k.max = 10) +
labs(subtitle = "Silhouette method")#Para clustering average
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "average", method = "silhouette", k.max = 10) +
labs(subtitle = "Silhouette method")#Para clustering complete
fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "complete", method = "silhouette", k.max = 10) +
labs(subtitle = "Silhouette method")fviz_nbclust(normDisMatrix, FUN = hcut, hc_func = "hclust", hc_method = "ward.D2", method = "silhouette", k.max = 10) +
labs(subtitle = "Silhouette method")csin <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "single")
singleTree <- as.phylo(csin)
write.tree(phy=singleTree, file="single.tree")
cave <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "average")
averageTree <- as.phylo(cave)
write.tree(phy= averageTree, file= "average.tree")
ccom <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "complete")
completeTree <- as.phylo(ccom)
write.tree(phy= completeTree, file= "complete.tree")
cwar <- hclust(dist(normDisMatrix, method= 'euclidean'), method = "ward.D2")
wardTree <- as.phylo(cwar)
write.tree(phy= wardTree , file= "ward.tree")#Para clustering single
clustersSingle <- cutree(csin, k=5)
fviz_cluster(list(data = normDisMatrix, cluster = clustersSingle))#Para clustering average
clustersAverage <- cutree(cave, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))#Para clustering complete
clustersComplete <- cutree(ccom, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))# Para clustering Ward
clustersWard <- cutree(cwar, k=4)
fviz_cluster(list(data = normDisMatrix, cluster = clustersAverage))plot (csin, hang = -1, main = "Single")
rect.hclust(csin, k=5, border=1:16)plot (cave, hang = -1, main = "Average")
rect.hclust(cave, k=4, border=1:16)plot (ccom, hang = -1, main = "Complete")
rect.hclust(ccom, k=5, border=1:16)plot (cwar, hang = -1, main = "Ward.D")
rect.hclust(cwar, k=4, border=1:16)coefSingle <- coef(csin)
coefAverage <- coef(cave)
coefComplete <- coef(ccom)
coefWard <- coef(cwar)
dfCoefs <- data.frame(Metodo= c('Single', 'Average', 'Complete', 'Ward'), Coeficiente = c(coefSingle, coefAverage, coefComplete, coefWard))
dfCoefs